/****************************************************************************************************
* Title: Workhorses of Opportunity, Howard and Weinstein
*
* Inputs:
*   - justnormasylum.dta
*   - county_outcomes_dta.dta
*
* Outputs:
*   - principal_components_labor.pdf
*   - principal_components_educ.pdf
*   - principal_components_social.pdf
*   - principal_components.pdf
*
* Description:
*   This script calculates PCA-based indices (labor market, education, social outcomes) from county-
*   level Chetty data and evaluates the effect of normal school counties on each index across income
*   percentiles. It also applies multiple hypothesis testing correction (Romano-Wolf) on the composite index.
****************************************************************************************************/

* Prepare normal school flag
clear 
use "justnormasylum"
rename hasnormalschool hasnormal
keep hasnormal cty_fips 
tempfile normal
save `normal'

* Setup
clear
clear matrix
clear mata
set maxvar 15000

* Merge outcome data with normal flag
clear
use county_outcomes_dta
gen cty_fips = state*1000 + county
rename state statefips
rename county countychetty
merge 1:m cty_fips using `normal', keep(3)

* Harmonize teen birth variable
foreach percentile in p1 p25 p50 p75 p100 {
	gen teenbrth_pooled_pooled_`percentile' = teenbrth_pooled_female_`percentile'
}

* Keep relevant variables
keep coll_pooled_pooled_* somecoll_pooled_pooled_* hs_pooled_pooled_* ///
	 married_pooled_pooled_* teenbrth_pooled_pooled_* jail_pooled_pooled_* ///
	 staycz_pooled_pooled_* stayhome_pooled_pooled_* kfr_pooled_pooled_* ///
	 working_pooled_pooled_* kir_pooled_pooled_* cty_fips hasnormal statefips
drop *_mean *_n *_se *p10

* Reshape to long form
reshape long coll_pooled_pooled_ somecoll_pooled_pooled_ hs_pooled_pooled_ ///
	married_pooled_pooled_ teenbrth_pooled_pooled_ jail_pooled_pooled_ ///
	staycz_pooled_pooled_ stayhome_pooled_pooled_ kfr_pooled_pooled_ ///
	working_pooled_pooled_ kir_pooled_pooled_, i(cty_fips) j(percentile) string

* Construct principal components for overall, educational, social, and labor outcomes
pca coll_pooled_pooled_ somecoll_pooled_pooled_ hs_pooled_pooled_ ///
	married_pooled_pooled_ teenbrth_pooled_pooled_ jail_pooled_pooled_ ///
	staycz_pooled_pooled_ stayhome_pooled_pooled_ kfr_pooled_pooled_ ///
	working_pooled_pooled_ kir_pooled_pooled_
predict pr_comp_pooled_pooled_

pca coll_pooled_pooled_ somecoll_pooled_pooled_ hs_pooled_pooled_
predict pca_educ_pooled_pooled_

pca married_pooled_pooled_ teenbrth_pooled_pooled jail_pooled_pooled_ ///
	staycz_pooled_pooled_ stayhome_pooled_pooled_
predict pca_social_pooled_pooled_

pca kfr_pooled_pooled_ working_pooled_pooled_ kir_pooled_pooled_
predict pca_labor_pooled_pooled_

* Reshape back to wide form
reshape wide coll_pooled_pooled_ somecoll_pooled_pooled_ hs_pooled_pooled_ ///
	married_pooled_pooled_ teenbrth_pooled_pooled jail_pooled_pooled_ ///
	staycz_pooled_pooled_ stayhome_pooled_pooled_ kfr_pooled_pooled_ ///
	working_pooled_pooled_ kir_pooled_pooled_ pr_comp_pooled_pooled ///
	pca_educ_pooled_pooled_ pca_social_pooled_pooled pca_labor_pooled_pooled_, ///
	i(cty_fips) j(percentile) string

* Estimate effect of normal school county on each component index
tempfile results
foreach v in labor educ social {
	foreach p in 1 25 50 75 100 {
		reg pca_`v'_pooled_pooled_p`p' hasnormal, absorb(statefips) cluster(statefips)
		preserve
			parmest, fast
			drop if parm == "_cons"
			gen percentile = `p'
			gen outcome = "`v'"
			cap append using `results'
			save `results', replace
		restore
	}
	
	* Plot results for each component
	preserve
		use `results', clear
		gen min90 = -invttail(dof, .05)*stderr + estimate
		gen max90 =  invttail(dof, .05)*stderr + estimate

		twoway rspike min95 max95 percentile if outcome == "`v'" || ///
			rcap min90 max90 percentile if outcome == "`v'", color(dkgreen) || ///
			scatter estimate percentile if outcome == "`v'", mcolor(dkgreen) ///
			yline(0, lpattern(dash) lcolor(gs8)) ///
			ytitle("Effect of Normal County (Spikes)") ///
			xtitle("Parents' Income Percentile") legend(off) ///
			xlabel(0 25 50 75 100)
		graph export "principal_components_`v'.pdf", as(pdf) replace
	restore
}

* Rename variables for consistency
rename *pooled_pooled* *pp*

* Test composite PCA index across percentiles
tempfile results2
foreach p in 1 25 50 75 100 {
	reg pr_comp_pp_p`p' hasnormal, absorb(statefips) cluster(statefips)
	preserve
		parmest, fast
		drop if parm == "_cons"
		gen percentile = `p'
		cap append using `results2'
		save `results2', replace
	restore
}

* Plot main PCA index effects
preserve
	use `results2', clear
	gen min90 = -invttail(dof, .05)*stderr + estimate
	gen max90 =  invttail(dof, .05)*stderr + estimate

	twoway rspike min95 max95 percentile || ///
		rcap min90 max90 percentile, color(dkgreen) || ///
		scatter estimate percentile, mcolor(dkgreen) ///
		yline(0, lpattern(dash) lcolor(gs8)) ///
		ytitle("Effect of Normal County (Spikes)") ///
		xtitle("Parents' Income Percentile") legend(off) ///
		xlabel(0 25 50 75 100)
	graph export "principal_components.pdf", as(pdf) replace
restore

* Apply Romano-Wolf multiple testing correction to composite index
rwolf pr_comp_pp_p1 pr_comp_pp_p25 ///
	pr_comp_pp_p50 pr_comp_pp_p75 pr_comp_pp_p100, ///
	indepvar(hasnormal) method(regress) controls(i.statefips) seed(7) ///
	cluster(statefips) vce(cluster statefips) reps(1000)
